# 导入PyMuPDFLoader, 用于加载PDF文件
from langchain_community.document_loaders import PyMuPDFLoader

# Load the PDF as a list of Documents (one per page with the loader's default
# settings). `load()` is used instead of `load_and_split()`: the latter also
# runs LangChain's default text splitter over the result, so `pages` would
# hold pre-split chunks rather than whole pages and the text would end up
# being split twice (once here, once by the splitter configured below).
loader = PyMuPDFLoader(file_path="llama2.pdf")
pages = loader.load()

# 导入RecursiveCharacterTextSplitter, 用于将文档分割成更小的块
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter that cuts text into small, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # function used to measure the length of a chunk
    chunk_size=300,  # size of each chunk
    chunk_overlap=100,  # size of the overlap between consecutive chunks
    add_start_index=True,  # whether to add the start index to each chunk
)

# Split only the first loaded document with text_splitter. Slicing
# (`pages[:1]`) rather than indexing (`pages[0]`) gives an empty result
# instead of an IndexError when the loader returned no documents, and
# `split_documents` carries the source document's metadata over to each
# chunk instead of discarding it.
paragraphs = text_splitter.split_documents(pages[:1])

# Print every chunk, each followed by a 30-dash separator line.
for paragraph in paragraphs:
    print(paragraph.page_content)
    print("-" * 30)


